{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "363193dd",
   "metadata": {},
   "source": [
    "### Remove genes correlated with technical or biological covariates "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0a531208",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from scipy.stats import spearmanr\n",
    "from pathlib import Path\n",
    "from statsmodels.stats import multitest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "11192987",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "# inputs \n",
    "cov_corr_dir = wkdir_path.joinpath(\"2_misexp_qc/misexp_cov_corr\")\n",
    "inactive_genes_path = wkdir_path.joinpath(\"1_rna_seq_qc/gene_sets/inactive_genes_8779.txt\")\n",
    "zscore_tpm_flat_path = wkdir_path.joinpath(\"1_rna_seq_qc/zscore_tpm_flat/tpm_zscore_4568smpls_8739genes_tpm0.1_frac_5.0perc_flat.csv\")\n",
    "# output directory \n",
    "out_dir = wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr\")\n",
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)\n",
    "# variables \n",
    "spearman_rho_cutoff = 0.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd38911a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes: 8779\n"
     ]
    }
   ],
   "source": [
    "inactive_genes = pd.read_csv(inactive_genes_path, sep=\"\\t\", header=None)[0].tolist()\n",
    "num_inactive_genes = len(set(inactive_genes))\n",
    "print(f\"Number of inactive genes: {num_inactive_genes}\")\n",
    "\n",
    "misexp_cov_corr_path = wkdir_path.joinpath(f\"2_misexp_qc/misexp_cov_corr/interval_gene_cov_corr.tsv\")\n",
    "misexp_cov_corr_df = pd.read_csv(misexp_cov_corr_path, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0af69dfa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes with correlations: 8779\n",
      "Number of covariates with correlations: 225\n"
     ]
    }
   ],
   "source": [
    "print(f\"Number of genes with correlations: {len(misexp_cov_corr_df.gene_id.unique())}\")\n",
    "print(f\"Number of covariates with correlations: {len(misexp_cov_corr_df.covariate.unique())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "653d3f44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# multiple testing correction \n",
    "misexp_cov_corr_nonan_df = misexp_cov_corr_df[~misexp_cov_corr_df.spearman.isna()].copy()\n",
    "pval_as_array = misexp_cov_corr_nonan_df.pval.to_numpy()\n",
    "for method in [\"fdr_bh\", \"bonferroni\"]:\n",
    "    pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, alpha=0.05, method=method)\n",
    "    misexp_cov_corr_nonan_df[f\"{method}_pass\"] = pass_test\n",
    "    misexp_cov_corr_nonan_df[f\"{method}_pval_adj\"] = pval_adj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "87fda17d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes failing QC: 129\n"
     ]
    }
   ],
   "source": [
    "# genes fail \n",
    "misexp_cov_corr_nonan_fail_df = misexp_cov_corr_nonan_df[(misexp_cov_corr_nonan_df.spearman.abs() > spearman_rho_cutoff) &\n",
    "                                                    (misexp_cov_corr_nonan_df.fdr_bh_pass)]\n",
    "gene_id_fail_cutoff = misexp_cov_corr_nonan_fail_df.gene_id.unique()\n",
    "print(f\"Number of genes failing QC: {len(gene_id_fail_cutoff)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6d0a0723",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write genes to file \n",
    "genes_corr_tech_covs_path = out_dir_path.joinpath(\"genes_corr_tech_covs.txt\")\n",
    "with open(genes_corr_tech_covs_path, 'w') as f_out: \n",
    "    for gene_id in gene_id_fail_cutoff: \n",
    "        f_out.write(f\"{gene_id}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6d8577e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove genes correlated with technical covariates \n",
    "zscore_tpm_flat_df = pd.read_csv(zscore_tpm_flat_path, sep=\",\")\n",
    "zscore_tpm_flat_rmvd_genes_df = zscore_tpm_flat_df[~zscore_tpm_flat_df.gene_id.isin(gene_id_fail_cutoff)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bf2cc79e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes remaining in matrix: 8610\n",
      "Number of samples remaining in matrix: 4568\n"
     ]
    }
   ],
   "source": [
    "num_gene_id_z_tpm_flat_remaining = zscore_tpm_flat_rmvd_genes_df.gene_id.nunique()\n",
    "print(f\"Number of genes remaining in matrix: {num_gene_id_z_tpm_flat_remaining}\")\n",
    "# this number differs to the one above as we removed 40 genes with TPM=0 across all \n",
    "# samples to calculate z-scores \n",
    "number_smpls = zscore_tpm_flat_rmvd_genes_df.rna_id.nunique()\n",
    "print(f\"Number of samples remaining in matrix: {number_smpls}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a50ca65d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes remaining: 8650\n"
     ]
    }
   ],
   "source": [
    "# write inactive genes passing QC to file \n",
    "inactive_genes_pass_qc_path = wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr/gene_id_post_tech_cov_qc_8650.txt\")\n",
    "inactive_genes_pass_qc = set(inactive_genes) - set(gene_id_fail_cutoff)\n",
    "print(f\"Number of inactive genes remaining: {len(inactive_genes_pass_qc)}\")\n",
    "with open(inactive_genes_pass_qc_path, \"w\") as f_out: \n",
    "    for gene_id in inactive_genes_pass_qc: \n",
    "        f_out.write(f\"{gene_id}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9870840e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write remaining genes to file \n",
    "xcell_gene_id_corr_pval = out_dir_path.joinpath(f\"tpm_zscore_{number_smpls}smpls_{num_gene_id_z_tpm_flat_remaining}genes_flat_misexp_corr_qc.csv\")\n",
    "zscore_tpm_flat_rmvd_genes_df.to_csv(xcell_gene_id_corr_pval, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
